/************************************************************************
Purpose: 	Create a plant-period NeML Inventory data set.

Notes:      The final product is a plant-period NeML inventory data set 
		    (N=156, T=10) with the following columns:
			* gpcb_id: plant unique id
			* commodity: commodity unique id
			* allocated_quantity: initial permit allocation (kg)
			* sold: permits sold in a period (kg)
			* purchase: permits purchased in a period (kg)
			* consumed: validated emissions in a period (kg)
			* current_inventory: permits left by the end of a period (kg)
*************************************************************************/

* Session setup: no output paging, empty memory, -pause- breakpoints enabled
set more off
clear all
pause on

********************************************************************************
*** Build the baseline-covariate file and the plant index for trading plants
********************************************************************************

* Variables carried into the covariate file (besides the plant id)
local heat_vars  plant_total_heatoutput ln_plant_total_heatoutput
local other_vars D_cyc D_scr D_bf D_esp cyc_max bf_max scr_esp_max ///
	num_cyclones num_scrubbers num_bagfilters num_esps

use "$EMISSIONS_DATA_OUT/Rule0_Panel.dta", clear

* Discard every observation whose D_treatment equals 0
keep if D_treatment != 0

* Reduce to the distinct combinations of plant id and covariates
keep gpcb_id `other_vars' `heat_vars'
duplicates drop

order gpcb_id `heat_vars'
label var gpcb_id "Plant GPCB ID"
save "$TRADING_DATA_OUT/covariates_plant.dta", replace

* The plant index holds the id column only
keep gpcb_id
save "$TRADING_DATA_OUT/index_plant.dta", replace

********************************************************************************
*** Import and clean the raw NeML inventory records
********************************************************************************

import delimited "$TRADING_DATA_IN/NeMLinventory_Sept2019-Mar2021.csv", clear

* Swap the year-month-day string for a Stata daily date under the same name
gen mydate = date(date, "YMD")
drop date
rename mydate date
format date %tddd-Mon-CCYY
sort date gpcb_id

* Number the commodities 1..10 in the order listed below; any record whose
* commodity is not in the list is discarded.
local commodity_ids PSUM160919 PSUM161019 PSUM161119 PSUM010120 PSUM010220 ///
	PSUM010320 PSUM011220 PSUM010121 PSUM010221 PSUM010321
local n_ids : word count `commodity_ids'

gen commodity_period = .
forvalues p = 1/`n_ids' {
	local this_id : word `p' of `commodity_ids'
	replace commodity_period = `p' if commodity == "`this_id'"
}
drop if missing(commodity_period)

* Save the commodity <-> period-number lookup without disturbing the data in memory
preserve
	keep commodity commodity_period
	duplicates drop
	label var commodity "Commodity ID"
	label var commodity_period "Commodity period number"
	save "$TRADING_DATA_OUT/index_commodity-period.dta", replace
restore

********************************************************************************
*** Check duplicated record
********************************************************************************

* sum_temp = number of records sharing the same plant, commodity and date.
* It is a diagnostic only: inspect it here (e.g. with -pause-) before the
* de-duplication below; it is dropped afterwards.
bysort gpcb_id commodity date: gen sum_temp = _N

********************************************************************************
*** Output a plant-commodity-date panel
********************************************************************************

* Within each plant-period-day, order the records by (sold, purchased) and keep
* only the last one. The within-group order is stated explicitly in the
* -bysort- so the step does not depend on how the data were sorted before.
bysort gpcb_id commodity_period date (sold purchased): ///
	gen last_record_of_day = (_n == _N)
keep if last_record_of_day == 1
drop last_record_of_day sum_temp
order gpcb_id commodity commodity_period date

* Make sure the plant id is numeric BEFORE recoding it: comparing or assigning
* a number to a string variable is a type-mismatch error. When gpcb_id is
* already numeric, -destring- only prints a note and changes nothing.
destring gpcb_id, replace
* Recode one plant id to the value used for it elsewhere
* NOTE(review): the reason for this recode is not documented in this file.
replace gpcb_id = 1732028288 if gpcb_id == 158777056
* 43,021 (plant, commodity, day) observations

label var gpcb_id "Plant GPCB ID"
label var commodity "Commodity ID"
label var commodity_period "Commodity period number"
label var date "Date"
label var allocated_quantity "Permits allocated (kg) from NeML inventory data"
label var blocked "Quantities of bids blocked (kg) from NeML inventory data"
label var sold "Quantities of bids sold (kg) from NeML inventory data"
label var purchased "Quantities of bids purchased (kg) from NeML inventory data"
label var consumed "Quantities of bids consumed (kg) from NeML inventory data"
label var current_inventory "Quantities of bids not consumed (kg) from NeML inventory data"

preserve
* Undo GPCB's period VII covid policy in which they reduced emissions
* for certain plants during the true-up period.
* For each plant, compute the running (cumulative) maximum of consumed
* emissions over time and save the result as a separate file for later use.
keep if commodity_period == 7

* Running maximum: each record takes the larger of its own value and the
* previous record's (already updated) value. The (date) in the -bysort- fixes
* the within-plant order explicitly. A missing previous value (including the
* first record of each plant) leaves the current value unchanged.
gen cummax_consumed = consumed
bysort commodity_period gpcb_id (date): replace cummax_consumed = ///
	cond(cummax_consumed[_n-1] > cummax_consumed & !missing(cummax_consumed[_n-1]), ///
	cummax_consumed[_n-1], cummax_consumed[_n])
gen consumed_noadjust = cummax_consumed

* Only keep records where the running maximum differs from the reported value,
* then keep the latest such record per plant.
* NOTE(review): this filters records, not plants -- the record kept is the last
* one that differs, which need not be the last record of the period. Confirm
* this matches the intended "emissions at the end of the period".
keep if consumed_noadjust != consumed
bysort gpcb_id commodity_period (date): gen last_record_of_period = (_n == _N)
keep if last_record_of_period == 1

* A few formatting modifications to make things easier down the line
gen period_num = 7
rename consumed_noadjust emission_val_noadjust
* NOTE(review): the 30/31 factor presumably rescales a 31-day period to a
* 30-day month (the label says "monthly pro-rated") -- confirm.
gen emission_val_noadjust_prorated = emission_val_noadjust * 30/31

keep gpcb_id period_num emission_val_noadjust_prorated

label var gpcb_id "Plant GPCB ID"
label var period_num "Commodity period number"
* Label the variable that was actually kept, by its full name, so the command
* does not depend on variable-name abbreviation being switched on.
label var emission_val_noadjust_prorated "Validate total emissions (monthly pro-rated kg) -- No GPCB period VII adjustment"

label data "Period VII unadjusted emissions. See footnote 22 in the paper for details"
save "$TRADING_DATA_CLEAN/panel_plant-period_unadjusted_emissions.dta", replace

restore

* Back on the full plant-commodity-date panel: write it to disk
save "$TRADING_DATA_OUT/inventory_plant-commodity-date.dta", replace

********************************************************************************
*** Collapse to one record per plant and commodity period
********************************************************************************

* Keep the record with the latest date in each plant-period
bysort gpcb_id commodity_period (date): gen last_record_of_period = (_n == _N)
keep if last_record_of_period
drop date last_record_of_period
order gpcb_id commodity commodity_period allocated_quantity blocked sold ///
	purchased consumed current_inventory
// save "$WORKING/NeML_inventory.dta", replace

* Restrict to in-sample plants: only ids present in the covariate file survive
merge m:1 gpcb_id using "$TRADING_DATA_OUT/covariates_plant.dta", keepusing(gpcb_id)
drop if _merge != 3
drop _merge

* 156 plants x 10 commodity periods = 1560 observations

* Mark the quantities as period-level values
foreach v in blocked sold purchased consumed {
	rename `v' `v'_period
}
rename current_inventory final_inventory_period
save "$TRADING_DATA_OUT/inventory_plant-commodity.dta", replace

********************************************************************************
*** Consistency check on the end-of-period inventory
********************************************************************************

use "$TRADING_DATA_OUT/inventory_plant-commodity.dta", clear
sort commodity_period gpcb_id

* Inventory implied by the other columns: allocation plus purchases, minus
* sales and consumption. Compare it against the reported final inventory.
gen my_final_inventory = allocated_quantity ///
	+ purchased_period - sold_period - consumed_period

* To inspect the mismatches interactively:
* br if my_final_inventory != final_inventory_period
* Result: 138 of the 1560 plant-period observations are inconsistent,
* all of them in PSUM010320 (commodity period 6).
